Agricultural Insurance Loss Clustering Example: Water Scarcity Damage Causes

Declaring commodity, year, unit, and damage cause grouping

Unit options include: Lossperacre, Lossperclaim, Loss, Acres

Clustering data setup

#agloss_supervised.R
#creates a map of an unsupervised model of damage cause by county,
#for a select year, commodity, and unit.
#
#unit examples:
#
#Loss
#Lossperacre
#Lossperclaim
#
#Example:
#
#damagetype options: waterscarcity OR waterexcess
#agloss_unsupervised(2014, WHEAT, Lossperacres, waterscarcity)

  #access data from nextcloud
  #damage <- fread("https://nextcloud.sesync.org/index.php/s/W5kztdb5ZkxRptT/download", header = TRUE)
  
  #access data from local UI server
  #damage <- read.csv("/dmine/code/git/dmine-clustering/damage.csv")
  #damage <- damage[,3:13]
   
  
  #access data from google drive
  id <- "1RxLEi0DiwMZFIw5CSBaEFjlLG67bKyLV" # google file ID
  damage <- fread(sprintf("https://docs.google.com/uc?id=%s&export=download", id), header = TRUE)
  damage <- damage[,2:12]
  
  #access data from local sesync server
  #setwd("/nfs/soilsesfeedback-data/data/RMA/")
  #damage <- read.csv("RMA_damage_combined.csv", strip.white=TRUE)
  
  #used for data from cloud
  #colnames(damage) <- c("ID", "Year", "State", "County", "Commodity", "Damagecause", "Loss", "Count", "Acres", "Lossperacre", "Lossperclaim", "Acresperclaim")
  #damage$State <- state.name[match(damage$State,state.abb)]
  
  colnames(damage) <- c("Year", "State", "County", "Commodity", "Damagecause", "Loss", "Count", "Acres", "Lossperacre", "Lossperclaim", "Acresperclaim")
  damage$State <- state.name[match(damage$State,state.abb)]
  
  
  library(reshape2) ; damage_loss <- dcast(damage, State + County + Year + Commodity ~ Damagecause, value.var = c(units), sum)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
  damage_loss <- subset(damage_loss, Commodity == crops)
  damage_loss <- subset(damage_loss, Year == years)
  
  #select only damage causes that are associates with Water Scarcity
  damage_loss <- subset(damage_loss, select= waterscarcity)
  damage_loss <- transform(damage_loss, ID=paste(State, County, sep="-"))
  rownames(damage_loss) <- damage_loss$ID
  
  damage_loss <- damage_loss[,5:length(waterscarcity)]
  
  #remove damage causes that have NO values for ANY counties
  damage_loss <- damage_loss[, colSums(damage_loss != 0) > 0]
  #identify the optimum number of clusters using NbClust
  par(mar=c(1,1,1,1))
  library("NbClust")
  clust<-NbClust(damage_loss[,1:(length(waterscarcity)-4)], diss=NULL, distance = clustering_distance, min.nc=2, max.nc=20, method = "kmeans", index = "all") 

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 3 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 8 proposed 8 as the best number of clusters 
## * 3 proposed 13 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 1 proposed 17 as the best number of clusters 
## * 2 proposed 18 as the best number of clusters 
## * 1 proposed 19 as the best number of clusters 
## * 1 proposed 20 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  8 
##  
##  
## *******************************************************************
  #res$All.index
  #res$Best.nc
  #res$All.CriticalValues
  #res$Best.partition
  
  maxclusters <- length(unique(clust$Best.partition))
  
  km.res <- kmeans(damage_loss, maxclusters, nstart = 25)
  
  pam.res <- pam(damage_loss, maxclusters)
  
  hc.cut <- hcut(damage_loss, k = maxclusters, hc_method = "complete")

Visualize clusters

  library("factoextra")
  clusterplot <- fviz_cluster(km.res, data = damage_loss, palette = "Spectral", ellipse.type = "norm", geom = c("point"), main = paste(crops, " ", years, " ", units, sep=""))+
    theme_bw()
  
  clusterplot

Mapping clusters

  counties <- readShapePoly('/dmine/data/counties/UScounties_conus.shp',
                            proj4string=CRS
                            ("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0"))
  projection = CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
  
  
  #--damage_loss
  
  damage_loss_km <- data.frame(km.res$cluster)
  damage_loss_km$State <- rownames(damage_loss_km)
  
  cluster <- data.frame(damage_loss_km$km.res.cluster)
  colnames(cluster) <- c("cluster")
  damage_loss_km <- data.frame(do.call('rbind', strsplit(as.character(damage_loss_km$State),'-',fixed=TRUE)))
  damage_loss_km <- cbind(cluster$cluster, damage_loss_km)
  colnames(damage_loss_km) <- c("Cluster", "STATE_NAME", "NAME")
  
  m <- merge(counties, damage_loss_km, by=c("STATE_NAME", "NAME"))
  
  pal <- colorFactor(brewer.pal(maxclusters, "Spectral"),  na.color = "#ffffff", domain = eval(parse(text=paste("m$", "Cluster", sep=""))))
  
  
  exte <- as.vector(extent(counties))
  
  label <- paste(sep = "<br/>", m$STATE_NAME, round(eval(parse(text=paste("m$", "Cluster", sep=""))), 0))
  markers <- data.frame(label)
  labs <- as.list(eval(parse(text=paste("m$", "Cluster", sep=""))))
  
  library(htmltools)
  
  tag.map.title <- tags$style(HTML("
  .leaflet-control.map-title { 
    transform: translate(-50%,20%);
    position: fixed !important;
    left: 50%;
    text-align: center;
    padding-left: 10px; 
    padding-right: 10px; 
    background: rgba(255,255,255,0.75);
    font-weight: bold;
    font-size: 16px;
  }
"))

title <- tags$div(
  tag.map.title, HTML("KMeans Damage Cause Clustering: Water Scarcity")
)  

  map <- leaflet(data = m) %>% addProviderTiles("Stamen.TonerLite") %>% addControl(title, position = "topleft", className="map-title") %>% fitBounds(exte[1], exte[3], exte[2], exte[4]) %>% addPolygons(color = ~pal(eval(parse(text=paste("m$", "Cluster", sep="")))), popup = markers$label,  weight = 1) %>%
    addLegend(pal = pal, values = ~na.omit(eval(parse(text=paste("m$", "Cluster", sep="")))),  labels = c("1", "2"), opacity = .5, title = paste(years, " ", crops, " ", units, sep=""),
              position = "bottomright")
  
  map